// CuNNy faster - https://github.com/funnyplanter/CuNNy

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// 
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// 
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.


//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-0002
//!USE MulAdd
//!CAPABILITY FP16

#include "../StubDefs.hlsli"

// Source image; read by Pass1 (luma extraction) and Pass4 (colour reconstruction).
//!TEXTURE
Texture2D INPUT;

// Final result, declared at twice the input width and height; written by Pass4.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

// Point-filtered sampler; used by the O() macro for per-texel fetches.
//!SAMPLER
//!FILTER POINT
SamplerState SP;

// Linearly-filtered sampler; used by Pass4 to sample INPUT at output-pixel positions.
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;

//!COMMON
// O(t, x, y): samples texture `t` at mip level 0 with the point sampler SP, at
// an offset of (x, y) texels from `pos`. The expansion refers to `pos` and
// `pt` by name, so both must be defined in the scope where the macro is used.
#define O(t, x, y) t.SampleLevel(SP, pos + float2(x, y) * pt, 0)
// Short aliases for the MF4 / MF4x4 types (not declared in this file;
// presumably provided by the host framework or StubDefs.hlsli -- TODO confirm).
#define V4 MF4
#define M4 MF4x4

// Intermediate feature textures T0..T3, each declared at input resolution in
// R8G8B8A8_UNORM (four channels per texture). The passes alternate between
// the two pairs: Pass1 writes T0/T1, Pass2 reads T0/T1 and writes T2/T3,
// Pass3 reads T2/T3 and writes T0/T1 again, and Pass4 reads T0/T1.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T0;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T2;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T3;

//!PASS 1
//!DESC in (1x8)
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT T0, T1

// L0(x, y): luma of INPUT at texel offset (x, y) from `pos`, computed as the
// dot product of the sampled RGB with the weights (0.299, 0.587, 0.114).
#define L0(x, y) MF(dot(MF3(0.299, 0.587, 0.114), O(INPUT, x, y).rgb))

// Pass 1: input layer. Applies a 3x3 convolution to the luma of INPUT and
// produces eight feature channels per pixel: four stored in T0 (r0) and four
// in T1 (r1). Negative results are clamped to zero before being written.
//
// blockStart: origin of this thread group's block, in pixels.
// tid:        thread id; only tid.x is used, as the index handed to Rmp8x8.
void Pass1(uint2 blockStart, uint3 tid) {
	// Size of one input texel in the coordinate space used for sampling
	// (presumably 1 / input size -- GetInputPt is defined elsewhere).
	float2 pt = float2(GetInputPt());
	// Pixel handled by this thread. Rmp8x8 is defined elsewhere; presumably it
	// maps the linear thread index to a position inside the 8x8 block.
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 sz = GetInputSize();
	// Threads that fall outside the input image do nothing.
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	// Sampling position of the pixel centre; read implicitly by the O()/L0() macros.
	float2 pos = (gxy + 0.5) * pt;
	// 3x3 luma neighbourhood, named s0_<row>_<col>: row 0 is y = -1, column 0 is x = -1.
	MF s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2;
	V4 r0 = 0.0, r1 = 0.0;
	// The accumulators start from constant per-channel offsets (the layer's bias terms).
	r0 = V4(-1.467e-03, -2.492e-04, -6.573e-04, -6.401e-04);
	r1 = V4(-4.736e-03, 7.443e-03, 2.352e-03, -8.863e-04);
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	// Each tap contributes its scalar luma times a 4-component weight vector
	// to each accumulator. The statement order fixes the floating-point
	// summation order, so it should not be rearranged.
	r0 = mad(s0_0_0, V4(-3.649e-02, 6.494e-03, 6.929e-03, -1.320e-02), r0);
	r1 = mad(s0_0_0, V4(-2.453e-01, 2.722e-02, -7.841e-02, -8.301e-01), r1);
	r0 = mad(s0_0_1, V4(-5.663e-02, 2.213e-03, -9.981e-03, 7.036e-03), r0);
	r1 = mad(s0_0_1, V4(-2.193e-01, 5.934e-04, -3.767e-02, -6.469e-02), r1);
	r0 = mad(s0_0_2, V4(-5.772e-02, -7.180e-03, 4.832e-03, -4.077e-03), r0);
	r1 = mad(s0_0_2, V4(3.104e-02, 3.213e-03, 8.831e-02, -1.925e-02), r1);
	r0 = mad(s0_1_0, V4(-1.444e-01, 2.094e-03, -8.605e-01, -3.923e-02), r0);
	r1 = mad(s0_1_0, V4(1.032e-01, 4.432e-02, 3.857e-01, 8.655e-01), r1);
	r0 = mad(s0_1_1, V4(7.103e-01, 8.262e-01, 8.574e-01, 6.191e-01), r0);
	r1 = mad(s0_1_1, V4(8.105e-01, -8.463e-02, -5.097e-01, 4.015e-02), r1);
	r0 = mad(s0_1_2, V4(-8.078e-02, -1.313e-01, -2.669e-04, 3.881e-02), r0);
	r1 = mad(s0_1_2, V4(-1.696e-01, -3.874e-02, 1.460e-01, 1.208e-02), r1);
	r0 = mad(s0_2_0, V4(-5.270e-02, -9.660e-03, 3.139e-03, 5.236e-02), r0);
	r1 = mad(s0_2_0, V4(8.130e-02, 2.059e-01, 1.882e-01, -3.923e-02), r1);
	r0 = mad(s0_2_1, V4(-1.171e-01, -7.569e-01, 9.770e-04, -6.428e-02), r0);
	r1 = mad(s0_2_1, V4(-2.483e-01, 3.656e-02, -2.046e-01, 3.488e-02), r1);
	r0 = mad(s0_2_2, V4(1.227e-02, 6.710e-02, -5.071e-03, 3.497e-02), r0);
	r1 = mad(s0_2_2, V4(-1.450e-01, 6.362e-02, 2.222e-02, -1.415e-03), r1);
	// Clamp negatives to zero (ReLU-style activation) and store the features.
	r0 = max(r0, 0.0);
	T0[gxy] = r0;
	r1 = max(r1, 0.0);
	T1[gxy] = r1;
}

//!PASS 2
//!DESC conv1 (8x8)
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN T0, T1
//!OUT T2, T3

// L0 / L1: the four channels of T0 / T1 at texel offset (x, y) from `pos`.
#define L0(x, y) V4(O(T0, x, y))
#define L1(x, y) V4(O(T1, x, y))

// Pass 2: first hidden layer. Applies a 3x3 convolution to the eight feature
// channels held in T0 and T1 and produces eight new channels: four stored in
// T2 (r0) and four in T3 (r1). Negative results are clamped to zero.
//
// blockStart: origin of this thread group's block, in pixels.
// tid:        thread id; only tid.x is used, as the index handed to Rmp8x8.
void Pass2(uint2 blockStart, uint3 tid) {
	// Size of one input texel in the coordinate space used for sampling
	// (presumably 1 / input size -- GetInputPt is defined elsewhere).
	float2 pt = float2(GetInputPt());
	// Pixel handled by this thread. Rmp8x8 is defined elsewhere; presumably it
	// maps the linear thread index to a position inside the 8x8 block.
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 sz = GetInputSize();
	// Threads that fall outside the input image do nothing.
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	// Sampling position of the pixel centre; read implicitly by the O()/L0()/L1() macros.
	float2 pos = (gxy + 0.5) * pt;
	// 3x3 neighbourhoods named s<tex>_<row>_<col>: s0_* come from T0, s1_* from T1;
	// row 0 is y = -1 and column 0 is x = -1.
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	// Accumulators start at zero: this layer has no constant offset term.
	V4 r0 = 0.0, r1 = 0.0;
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	s1_0_0 = L1(-1.0, -1.0); s1_0_1 = L1(0.0, -1.0); s1_0_2 = L1(1.0, -1.0);
	s1_1_0 = L1(-1.0, 0.0); s1_1_1 = L1(0.0, 0.0); s1_1_2 = L1(1.0, 0.0);
	s1_2_0 = L1(-1.0, 1.0); s1_2_1 = L1(0.0, 1.0); s1_2_2 = L1(1.0, 1.0);
	// Each tap is combined with one 4x4 weight matrix per accumulator. MulAdd is
	// not defined in this file (see `//!USE MulAdd`); presumably it adds the
	// product of its first two arguments to the third -- TODO confirm.
	// The statement order fixes the floating-point summation order.
	r0 = MulAdd(s0_0_0, M4(-2.745e-02, -2.207e-01, 1.935e-01, -1.900e-01, 2.286e-01, 2.744e-01, -8.664e-02, 1.118e-01, -4.883e-02, 9.057e-02, -4.562e-02, 1.048e-01, -9.133e-01, 6.688e-02, -6.078e-02, 3.743e-01), r0);
	r1 = MulAdd(s0_0_0, M4(-1.947e-01, -2.976e-02, -1.775e-02, -1.307e-01, 5.090e-01, 3.405e-01, 4.068e-02, 8.503e-02, 8.089e-02, 1.247e-02, 5.559e-02, 4.655e-02, 3.293e-01, 2.122e-01, 5.199e-01, 1.024e-03), r1);
	r0 = MulAdd(s0_0_1, M4(-3.874e-02, -2.083e-01, 7.888e-02, 1.068e-01, 6.695e-01, -3.078e-02, 2.589e-01, -4.537e-01, -3.048e-02, 2.744e-01, -8.475e-02, 3.597e-01, -5.918e-01, 2.407e-01, -3.240e-01, -2.955e-01), r0);
	r1 = MulAdd(s0_0_1, M4(-8.418e-02, 1.246e-03, -1.387e-01, -9.880e-02, 5.665e-01, 4.463e-01, -4.628e-02, -3.749e-01, 3.047e-01, 3.018e-01, 4.826e-01, -4.538e-02, 3.256e-01, 1.067e-01, -8.467e-02, 1.222e-01), r1);
	r0 = MulAdd(s0_0_2, M4(-6.977e-02, -1.212e-02, -1.844e-01, 7.854e-02, 7.446e-02, -2.596e-02, 1.750e-01, -1.119e-01, 1.223e-01, 2.946e-02, 2.369e-01, -1.947e-02, 6.332e-02, -3.517e-02, -3.114e-01, 4.048e-02), r0);
	r1 = MulAdd(s0_0_2, M4(-1.367e-01, -4.050e-02, 6.962e-02, 2.745e-01, 1.526e-02, 3.480e-02, -8.861e-02, -4.383e-01, 4.392e-02, 9.908e-02, 2.106e-01, -2.098e-01, 6.691e-02, -4.208e-02, -1.477e-01, 2.624e-01), r1);
	r0 = MulAdd(s0_1_0, M4(1.160e+00, -3.447e-01, 1.431e-01, -6.233e-01, -3.682e-01, 5.971e-02, 1.294e-01, -7.259e-02, -5.681e-01, 1.449e-01, 2.995e-02, 2.269e-01, 6.139e-01, -1.479e-01, -1.879e-01, 9.096e-01), r0);
	r1 = MulAdd(s0_1_0, M4(-3.301e-01, -8.343e-02, -4.568e-02, 7.738e-02, 6.387e-01, 5.020e-01, -7.634e-02, 2.607e-01, 3.192e-02, 3.449e-02, 1.919e-01, -6.372e-02, -2.151e-01, 4.512e-01, 6.904e-02, -2.574e-01), r1);
	r0 = MulAdd(s0_1_1, M4(-4.517e-01, -1.243e-01, -2.165e-01, -6.066e-01, 4.020e-01, 2.854e-03, 6.640e-02, 3.243e-03, -5.373e-01, 3.836e-01, 8.138e-02, 4.463e-01, 4.095e-01, 7.528e-01, 1.965e-01, 3.183e-01), r0);
	r1 = MulAdd(s0_1_1, M4(-6.218e-01, -4.830e-01, -9.349e-02, 5.566e-01, -2.872e-01, 3.838e-01, -1.341e-01, 5.983e-01, 9.584e-01, 6.938e-01, -2.568e-01, -3.701e-01, 5.510e-02, -7.565e-01, -1.861e-01, -4.291e-01), r1);
	r0 = MulAdd(s0_1_2, M4(6.486e-02, 1.377e-01, -3.370e-02, -2.791e-02, -1.140e-02, -9.545e-02, 2.359e-01, 3.221e-02, 5.838e-02, 7.187e-02, 8.145e-01, -4.951e-01, -8.723e-02, -4.000e-01, 4.580e-01, -5.536e-01), r0);
	r1 = MulAdd(s0_1_2, M4(1.394e-01, 8.523e-02, 1.672e-01, 1.163e-01, -1.975e-02, 1.853e-01, 3.929e-02, 5.932e-01, 7.156e-02, 2.065e-01, 1.014e-01, -8.406e-01, -2.946e-01, 1.205e-02, -1.384e-01, -9.426e-02), r1);
	r0 = MulAdd(s0_2_0, M4(1.578e-01, 3.479e-01, -4.638e-02, -1.476e-02, -2.450e-02, -2.310e-01, 3.853e-02, -3.572e-02, -2.824e-01, -1.821e-01, 2.635e-02, 1.115e-02, 4.829e-01, -1.333e-02, 1.353e-01, -1.404e-01), r0);
	r1 = MulAdd(s0_2_0, M4(4.062e-01, -7.839e-02, -1.528e-01, 1.463e-01, -2.094e-01, 5.457e-02, 4.655e-02, 3.771e-02, -8.435e-03, -2.585e-03, 2.200e-01, -2.484e-02, 2.209e-02, 5.240e-01, -6.499e-02, 3.130e-01), r1);
	r0 = MulAdd(s0_2_1, M4(-1.333e-01, 2.201e-01, -1.312e-01, 2.972e-01, -7.875e-02, 1.708e-02, 1.902e-02, 5.143e-02, -1.431e-01, -5.507e-01, -9.084e-02, -3.752e-02, 1.752e-01, -3.599e-01, 8.160e-02, -1.351e-01), r0);
	r1 = MulAdd(s0_2_1, M4(6.348e-01, 2.734e-02, 1.104e-02, -4.207e-01, -7.792e-02, -1.760e-02, 1.204e-01, 2.692e-01, -5.941e-01, 1.669e-01, 6.509e-02, 3.877e-01, -4.454e-01, -2.498e-01, -8.765e-02, 3.905e-01), r1);
	r0 = MulAdd(s0_2_2, M4(4.501e-02, 7.349e-02, -2.666e-01, 9.839e-02, 1.803e-02, 1.300e-02, -6.910e-02, 2.326e-02, 3.095e-02, -1.059e-01, 6.958e-02, -1.489e-01, -3.644e-02, 8.288e-02, 1.284e-01, 7.309e-02), r0);
	r1 = MulAdd(s0_2_2, M4(1.741e-01, -4.271e-02, 6.756e-03, -3.452e-01, -8.228e-02, -1.461e-02, 5.054e-03, 3.622e-01, -1.589e-01, 1.089e-01, 8.572e-02, 1.207e+00, 2.484e-02, -9.725e-02, 5.219e-02, -4.567e-01), r1);
	r0 = MulAdd(s1_0_0, M4(-1.583e-02, -3.945e-02, -4.681e-02, 3.724e-02, -5.691e-01, 2.831e-01, 1.781e-01, 3.673e-01, 1.344e-01, -2.381e-01, -3.967e-02, -5.579e-02, 1.124e-02, 4.429e-02, 1.785e-02, -7.294e-03), r0);
	r1 = MulAdd(s1_0_0, M4(-1.249e-02, -1.271e-02, -5.509e-02, -1.618e-01, 5.631e-02, 1.353e-01, 5.989e-01, -3.549e-01, -3.291e-02, 1.055e-02, -3.570e-01, 2.014e-03, 2.559e-02, 2.520e-02, 1.014e-01, 4.776e-02), r1);
	r0 = MulAdd(s1_0_1, M4(-6.841e-02, -3.330e-01, 1.491e-01, -2.479e-01, 8.614e-01, 2.899e-01, -1.464e-01, -4.568e-01, -1.016e-01, 1.217e-02, 8.128e-02, -1.909e-01, 8.666e-02, 1.567e-01, -4.480e-02, 9.731e-02), r0);
	r1 = MulAdd(s1_0_1, M4(-3.779e-01, -1.597e-01, -1.078e-01, -2.329e-01, -2.164e-01, -5.395e-02, 4.674e-02, -7.412e-01, -5.047e-02, 6.253e-03, 3.330e-03, 1.284e-01, 9.175e-02, 2.142e-02, 5.378e-03, 7.153e-02), r1);
	r0 = MulAdd(s1_0_2, M4(4.816e-02, -4.240e-02, 7.788e-02, -2.622e-02, -2.669e-02, 1.150e-01, 5.443e-01, 6.535e-01, 1.021e-01, -6.372e-02, -1.450e-01, -1.939e-01, -2.537e-02, 1.151e-01, -4.627e-02, 5.969e-02), r0);
	r1 = MulAdd(s1_0_2, M4(-3.323e-02, -1.189e-02, -4.776e-02, -1.254e-01, 1.630e-02, -4.824e-03, 3.465e-01, -1.163e-01, -4.700e-02, -7.449e-03, -1.178e-01, 2.407e-01, 1.349e-01, 5.684e-02, 8.430e-02, -4.766e-03), r1);
	r0 = MulAdd(s1_1_0, M4(3.267e-01, 1.082e-01, 6.145e-02, -2.367e-01, -6.572e-01, -2.386e-01, 1.232e-01, -2.354e-01, 2.557e-01, 5.896e-02, -6.426e-02, -2.649e-01, -8.893e-02, -5.766e-02, -2.966e-03, 5.249e-02), r0);
	r1 = MulAdd(s1_1_0, M4(-8.711e-02, 1.077e-04, -1.323e-01, -2.155e-02, 3.191e-01, -1.654e-01, 3.363e-01, 3.341e-01, -5.586e-02, 6.453e-02, -7.851e-02, 1.353e-01, 1.495e-02, 2.910e-02, 3.749e-02, -4.560e-03), r1);
	r0 = MulAdd(s1_1_1, M4(-1.276e-01, 5.603e-02, 4.266e-02, -2.442e-01, -1.999e-01, -6.974e-01, -4.758e-01, -8.425e-01, -6.083e-01, 1.831e-01, -1.002e-01, 2.716e-03, -7.994e-01, 4.148e-01, -2.609e-02, 3.311e-01), r0);
	r1 = MulAdd(s1_1_1, M4(-1.617e-01, -4.843e-01, -4.719e-02, 4.770e-02, 1.218e-01, 8.793e-02, -9.632e-01, 2.590e-01, 8.878e-04, -3.296e-02, 3.320e-01, 2.555e-01, 2.194e-01, 3.115e-01, 8.496e-01, -1.949e-01), r1);
	r0 = MulAdd(s1_1_2, M4(1.427e-02, -1.316e-01, 9.460e-02, -1.718e-01, -1.585e-01, -2.074e-01, 9.441e-04, -3.306e-01, -1.671e-01, 3.161e-02, -1.101e-01, 1.125e-01, -2.796e-01, 3.944e-01, -3.721e-01, 3.283e-01), r0);
	r1 = MulAdd(s1_1_2, M4(-1.260e-01, -8.491e-02, -7.202e-02, 1.288e-01, -1.071e-01, -4.142e-02, -1.240e-01, 5.697e-01, -2.579e-02, -7.446e-03, 9.417e-02, -3.176e-01, 4.365e-01, 1.960e-01, 1.096e-01, -2.448e-01), r1);
	r0 = MulAdd(s1_2_0, M4(2.348e-01, -1.035e-01, 1.214e-03, -9.820e-02, -2.979e-01, -1.009e-01, 1.762e-03, 2.216e-02, 2.369e-01, 2.364e-01, -4.041e-02, 1.902e-01, -1.076e-01, -3.857e-02, 1.176e-02, 1.821e-03), r0);
	r1 = MulAdd(s1_2_0, M4(1.106e-01, 6.168e-03, -8.660e-02, -9.408e-02, -4.449e-02, -1.545e-01, 1.303e-01, -4.042e-02, 2.602e-02, 6.939e-02, -2.552e-01, -7.157e-02, -4.227e-02, 4.203e-02, 1.665e-01, -6.815e-03), r1);
	r0 = MulAdd(s1_2_1, M4(1.327e-01, -1.606e-01, 6.922e-02, -1.945e-01, 1.564e-01, 4.677e-02, -8.247e-02, -1.052e-01, 1.166e-01, 2.031e-01, 2.183e-01, 3.076e-01, -1.039e+00, -7.060e-02, -1.160e-01, 4.229e-01), r0);
	r1 = MulAdd(s1_2_1, M4(9.163e-02, 6.150e-02, -1.353e-01, -4.893e-01, 1.450e-01, 2.789e-01, -3.849e-02, 2.974e-01, 1.348e-01, 9.801e-02, 7.791e-02, -3.330e-01, -6.113e-01, -3.717e-02, 2.095e-01, -2.653e-02), r1);
	r0 = MulAdd(s1_2_2, M4(-9.045e-02, 9.807e-03, 7.241e-02, 1.403e-03, 1.353e-01, 1.390e-01, -1.117e-01, 9.984e-02, 2.738e-01, 2.320e-01, 4.406e-02, 9.207e-03, 1.703e-01, -7.247e-02, 1.095e-01, 1.438e-01), r0);
	r1 = MulAdd(s1_2_2, M4(-4.790e-03, -2.152e-02, -2.678e-02, -1.558e-01, 2.494e-02, -4.282e-02, -2.160e-02, 3.721e-02, 9.600e-02, 3.137e-02, -2.145e-03, 6.140e-02, 2.852e-01, 3.834e-01, 2.035e-01, 1.803e-02), r1);
	// Clamp negatives to zero (ReLU-style activation) and store the features.
	r0 = max(r0, 0.0);
	T2[gxy] = r0;
	r1 = max(r1, 0.0);
	T3[gxy] = r1;
}

//!PASS 3
//!DESC conv2 (8x8)
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN T2, T3
//!OUT T0, T1

// L0 / L1: the four channels of T2 / T3 at texel offset (x, y) from `pos`.
#define L0(x, y) V4(O(T2, x, y))
#define L1(x, y) V4(O(T3, x, y))

// Pass 3: second hidden layer. Applies a 3x3 convolution to the eight feature
// channels held in T2 and T3 and produces eight new channels: four stored in
// T0 (r0) and four in T1 (r1), overwriting the Pass1 results. Negative
// results are clamped to zero.
//
// blockStart: origin of this thread group's block, in pixels.
// tid:        thread id; only tid.x is used, as the index handed to Rmp8x8.
void Pass3(uint2 blockStart, uint3 tid) {
	// Size of one input texel in the coordinate space used for sampling
	// (presumably 1 / input size -- GetInputPt is defined elsewhere).
	float2 pt = float2(GetInputPt());
	// Pixel handled by this thread. Rmp8x8 is defined elsewhere; presumably it
	// maps the linear thread index to a position inside the 8x8 block.
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 sz = GetInputSize();
	// Threads that fall outside the input image do nothing.
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	// Sampling position of the pixel centre; read implicitly by the O()/L0()/L1() macros.
	float2 pos = (gxy + 0.5) * pt;
	// 3x3 neighbourhoods named s<tex>_<row>_<col>: s0_* come from T2, s1_* from T3;
	// row 0 is y = -1 and column 0 is x = -1.
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	// Accumulators start at zero: this layer has no constant offset term.
	V4 r0 = 0.0, r1 = 0.0;
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	s1_0_0 = L1(-1.0, -1.0); s1_0_1 = L1(0.0, -1.0); s1_0_2 = L1(1.0, -1.0);
	s1_1_0 = L1(-1.0, 0.0); s1_1_1 = L1(0.0, 0.0); s1_1_2 = L1(1.0, 0.0);
	s1_2_0 = L1(-1.0, 1.0); s1_2_1 = L1(0.0, 1.0); s1_2_2 = L1(1.0, 1.0);
	// Each tap is combined with one 4x4 weight matrix per accumulator. MulAdd is
	// not defined in this file (see `//!USE MulAdd`); presumably it adds the
	// product of its first two arguments to the third -- TODO confirm.
	// The statement order fixes the floating-point summation order.
	r0 = MulAdd(s0_0_0, M4(3.721e-01, -2.400e-02, -1.956e-01, -2.366e-02, -9.801e-01, 3.329e-01, 2.192e-01, 1.466e-02, 5.032e-01, 4.557e-01, 1.382e-01, 6.692e-03, -6.218e-01, 3.035e-03, -9.975e-02, 2.041e-02), r0);
	r1 = MulAdd(s0_0_0, M4(-1.125e-01, -1.964e-02, -3.158e-02, -7.642e-02, 1.214e-02, 2.403e-02, 2.518e-03, 5.992e-02, -3.808e-02, -7.547e-02, 2.046e-02, 9.350e-02, -1.091e-02, -4.339e-02, -1.240e-03, 3.815e-02), r1);
	r0 = MulAdd(s0_0_1, M4(-1.061e+00, -1.985e-01, -9.808e-01, -2.885e-02, -3.455e+00, -5.276e-02, 1.655e-01, -1.392e-01, -5.314e-01, 3.927e-03, 1.428e-02, -1.066e-01, 1.385e+00, 1.430e-02, 5.197e-02, 1.058e-02), r0);
	r1 = MulAdd(s0_0_1, M4(2.568e-01, 5.616e-02, -2.189e-02, -6.037e-02, 1.284e-01, -1.030e-01, -3.021e-02, -2.181e-01, -1.265e-01, -3.797e-01, 3.796e-02, 4.211e-02, -2.102e-01, -3.914e-02, 1.532e-02, 7.398e-02), r1);
	r0 = MulAdd(s0_0_2, M4(-4.630e+00, 5.360e-02, -9.901e-02, 6.749e-02, -3.970e+00, -7.701e-03, 7.287e-03, -2.363e-03, -5.005e+00, 1.976e-02, -2.377e-03, 3.549e-03, -5.780e+00, -1.170e-01, -8.632e-03, 8.844e-03), r0);
	r1 = MulAdd(s0_0_2, M4(-1.027e+00, -4.827e-02, 1.986e-03, -2.380e-01, 2.717e-01, 9.447e-02, -3.643e-01, -7.834e-02, -3.782e-02, -1.204e-01, -5.315e-02, 2.084e-02, 8.963e-03, 3.009e-02, 1.096e-01, 2.298e-02), r1);
	r0 = MulAdd(s0_1_0, M4(3.437e-01, 7.051e-02, -1.714e-01, 7.105e-02, 8.972e-02, 3.610e-01, -1.654e-01, -1.354e-01, 1.456e-01, -4.150e-01, 6.514e-01, -4.283e-02, -1.893e-01, -5.605e-01, 2.214e-01, -3.887e-03), r0);
	r1 = MulAdd(s0_1_0, M4(-2.589e-01, 3.982e-02, -2.441e-02, -1.366e-01, 5.284e-02, -6.163e-02, -2.746e-02, -1.043e-01, -6.777e-01, 3.831e-02, -4.013e-02, -3.283e-01, -2.101e-01, 5.011e-02, 5.132e-03, -8.595e-02), r1);
	r0 = MulAdd(s0_1_1, M4(2.334e-02, -2.941e+01, -1.112e+00, -4.133e-01, 2.986e-01, 2.569e-02, 1.114e-01, 4.450e-01, 8.375e-01, 9.858e-02, -2.843e-02, -1.439e-01, -2.625e-01, 7.243e-01, -7.802e-02, -2.181e-01), r0);
	r1 = MulAdd(s0_1_1, M4(8.222e-01, 1.408e-01, 1.489e-01, 4.876e-01, -2.979e-01, -2.124e-01, -1.848e-01, 1.382e-01, 9.510e-01, 2.895e-01, -2.940e-01, -3.275e-03, 5.967e-01, 2.665e-01, 1.332e-01, 5.154e-02), r1);
	r0 = MulAdd(s0_1_2, M4(-2.033e-02, -4.102e-02, -2.331e-01, -7.625e-03, 1.539e-01, 1.332e-01, 6.752e-03, -2.610e-02, -2.315e-01, -3.520e-02, -2.606e-02, 3.162e-02, 3.172e-01, -2.522e-01, -3.822e-02, 1.669e-02), r0);
	r1 = MulAdd(s0_1_2, M4(-1.607e+00, -4.652e-01, -9.161e-01, -9.656e-01, -8.619e-02, 4.011e-02, 5.215e-01, 2.646e-01, -2.163e-01, -3.428e-01, -2.179e-01, -7.829e-02, -3.517e-01, -1.967e-01, -4.287e-01, -1.498e-01), r1);
	r0 = MulAdd(s0_2_0, M4(-1.786e-01, 9.090e-02, -2.610e-03, -5.163e-02, 6.480e-02, -3.163e-02, 1.155e-01, -8.818e-03, 2.394e-01, -5.334e-02, 2.987e-01, 5.863e-02, 3.900e-02, 4.309e-02, -2.314e-01, -6.592e-02), r0);
	r1 = MulAdd(s0_2_0, M4(-2.722e-02, 1.052e-01, 8.269e-03, -5.500e-02, -3.936e-03, 5.446e-03, -1.516e-02, 5.667e-03, -1.344e-02, 5.689e-02, 5.689e-03, -1.548e-01, 1.431e-02, -5.652e-02, -2.087e-02, -4.520e-02), r1);
	r0 = MulAdd(s0_2_1, M4(-1.032e-01, -1.703e-01, 6.614e-02, -4.873e-01, -2.534e-01, 9.399e-02, 2.212e-01, -3.525e-01, -1.027e-01, 1.957e-01, -1.243e-01, 4.235e-02, -9.614e-04, 1.865e-01, -6.629e-02, -1.868e-01), r0);
	r1 = MulAdd(s0_2_1, M4(1.028e-01, -1.106e-01, -7.235e-02, 4.743e-02, 7.260e-02, 1.035e-01, 3.983e-02, -3.822e-02, 2.064e-01, 5.926e-02, 2.607e-01, 4.834e-01, -2.974e-01, -3.156e-01, -1.608e-01, -1.681e-01), r1);
	r0 = MulAdd(s0_2_2, M4(-1.451e-01, -3.157e-02, 9.779e-02, -1.622e-02, -2.159e-01, -4.299e-02, -1.642e-02, -2.655e-02, -1.737e-01, -5.800e-02, -7.301e-02, 1.616e-02, -4.006e-01, -1.319e-01, -8.343e-02, 3.723e-02), r0);
	r1 = MulAdd(s0_2_2, M4(3.804e-02, 3.039e-01, -4.726e-02, -3.090e-01, 1.665e-01, 1.750e-01, -4.054e-02, 5.676e-02, -1.217e-01, -7.191e-02, 1.771e-02, -1.154e-01, -2.791e-02, -1.646e-01, -1.121e-01, -3.843e-02), r1);
	r0 = MulAdd(s1_0_0, M4(-7.360e-02, -9.049e-02, -8.863e-02, 3.107e-03, 3.522e-03, 1.528e-01, 1.353e-01, -1.450e-02, 2.771e-02, -2.253e-01, -1.361e-01, -6.045e-03, -2.504e-02, 5.064e-02, -1.548e-01, 2.374e-02), r0);
	r1 = MulAdd(s1_0_0, M4(-3.736e-02, -1.712e-03, 1.067e-02, -4.615e-03, 8.765e-02, -2.028e-02, -2.360e-02, -3.386e-02, -4.990e-02, -1.162e-02, 5.282e-03, -2.086e-02, 9.336e-02, 1.060e-02, -1.951e-02, -6.118e-02), r1);
	r0 = MulAdd(s1_0_1, M4(-8.324e-02, 2.494e-02, -2.002e-01, 4.015e-02, 2.385e-01, -6.613e-02, 2.512e-01, -6.451e-02, 1.712e-01, -9.852e-02, -2.800e-01, 7.153e-02, -9.909e-01, -3.173e-02, 1.414e-02, -8.374e-02), r0);
	r1 = MulAdd(s1_0_1, M4(-4.612e-02, 2.401e-02, 6.899e-02, 2.905e-01, 6.444e-02, 3.205e-02, -1.362e-01, -4.326e-01, -9.494e-02, -6.820e-03, 6.421e-02, 1.714e-01, -5.901e-01, -9.780e-02, 9.027e-03, 1.887e-02), r1);
	r0 = MulAdd(s1_0_2, M4(-6.049e+00, 7.153e-02, -5.054e-03, 2.387e-02, -8.097e+00, -8.952e-02, -6.126e-03, -4.407e-02, -1.088e+01, 2.262e-02, -1.584e-02, -4.606e-03, -7.515e+00, 8.446e-02, 1.333e-02, -1.092e-02), r0);
	r1 = MulAdd(s1_0_2, M4(-4.014e-01, -2.529e-01, 2.178e-01, 8.960e-02, 4.076e-01, 2.650e-01, -2.803e-01, -8.337e-02, -4.150e-01, -1.169e-01, 2.159e-01, 2.209e-02, 7.263e-02, -5.167e-02, -4.701e-02, 7.609e-03), r1);
	r0 = MulAdd(s1_1_0, M4(4.563e-02, -4.523e-01, -1.023e-01, 2.913e-02, -4.190e-02, 9.824e-01, 3.403e-01, -2.963e-03, 5.346e-02, -5.637e-01, -1.136e-01, -9.705e-03, -1.695e-01, -1.431e-01, -3.994e-01, -2.549e-01), r0);
	r1 = MulAdd(s1_1_0, M4(5.094e-02, 5.899e-02, 7.176e-03, 7.392e-02, -7.962e-02, -1.056e-01, 1.996e-02, 4.597e-02, -5.665e-02, 9.646e-02, -6.105e-03, -2.696e-02, -6.616e-02, -2.239e-02, 1.726e-02, 5.942e-02), r1);
	r0 = MulAdd(s1_1_1, M4(-3.702e-01, -4.155e-01, -1.366e-01, -5.453e-01, -6.551e-01, -4.287e-01, 2.979e-02, 7.101e-01, -4.266e-02, -3.441e-01, -1.953e-01, -3.330e-01, -4.962e-01, -4.581e-02, -7.599e-02, -1.879e-01), r0);
	r1 = MulAdd(s1_1_1, M4(-1.928e-01, -4.612e-02, -6.519e-02, -3.969e-01, 6.230e-01, -8.522e-02, 9.952e-02, 8.184e-01, -2.250e-01, 8.927e-02, -3.654e-03, -2.698e-01, -2.588e-01, -3.297e-01, -4.873e-01, -7.559e-01), r1);
	r0 = MulAdd(s1_1_2, M4(-3.205e-01, 2.685e-01, -1.573e-02, 1.145e-01, 7.271e-01, -2.939e-01, -3.419e-02, -2.347e-02, -5.742e-01, 3.567e-01, 1.567e-01, -5.314e-02, -5.630e-02, 4.847e-02, 4.357e-02, -7.304e-03), r0);
	r1 = MulAdd(s1_1_2, M4(8.431e-02, -7.407e-01, -5.968e-01, -2.895e-01, 3.057e-01, 1.355e+00, 1.218e+00, 3.684e-01, 4.405e-01, -1.468e-01, -4.819e-01, -2.115e-01, -8.326e-02, -2.803e-01, -2.881e-01, -8.513e-03), r1);
	r0 = MulAdd(s1_2_0, M4(4.779e-02, -1.131e-01, -4.024e-02, 4.553e-02, -1.352e-02, 1.175e-01, -2.165e-01, -5.019e-02, -6.223e-02, -2.675e-01, 2.218e-02, 2.798e-02, -1.053e-01, 3.073e-03, -1.362e-01, -2.476e-02), r0);
	r1 = MulAdd(s1_2_0, M4(5.614e-02, 4.595e-02, 1.443e-02, 1.143e-02, 8.510e-02, -1.797e-02, -1.004e-02, 3.894e-02, -2.742e-03, 4.209e-02, 1.094e-02, -2.318e-02, 4.537e-03, 1.701e-02, -9.907e-03, 3.756e-03), r1);
	r0 = MulAdd(s1_2_1, M4(3.562e-02, -1.879e-01, -1.339e-01, 4.551e-01, 4.446e-02, -7.805e-02, 9.978e-02, 1.169e-01, 2.374e-01, -1.717e-01, 5.298e-01, 7.449e-02, 2.152e-02, -8.717e-03, -6.392e-02, -2.474e-03), r0);
	r1 = MulAdd(s1_2_1, M4(-1.522e-01, -1.399e-02, 6.566e-03, 3.406e-02, -1.909e-01, -1.610e-01, -7.153e-02, -1.621e-01, 9.282e-05, 7.914e-02, 1.993e-02, -7.462e-02, -1.242e-01, -1.460e-01, -5.944e-02, -4.934e-02), r1);
	r0 = MulAdd(s1_2_2, M4(1.692e-01, 1.819e-01, 8.325e-02, 3.697e-02, -7.658e-03, -5.463e-03, -2.540e-02, -5.200e-03, 7.926e-02, 1.222e-01, 2.049e-01, -3.164e-02, 4.969e-02, -3.982e-03, -1.223e-02, 2.194e-04), r0);
	r1 = MulAdd(s1_2_2, M4(-1.815e-01, -1.559e-01, 1.636e-01, 9.129e-02, 5.644e-02, 7.938e-02, 4.851e-02, 1.213e-01, 2.705e-01, 3.824e-01, 2.329e-01, 2.939e-01, -3.185e-02, -5.969e-02, -3.864e-02, -3.748e-02), r1);
	// Clamp negatives to zero (ReLU-style activation) and store the features.
	r0 = max(r0, 0.0);
	T0[gxy] = r0;
	r1 = max(r1, 0.0);
	T1[gxy] = r1;
}

//!PASS 4
//!DESC out-shuffle (8x4)
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, T0, T1
//!OUT OUTPUT

// L0 / L1: the four channels of T0 / T1 at texel offset (x, y) from `pos`.
#define L0(x, y) V4(O(T0, x, y))
#define L1(x, y) V4(O(T1, x, y))

// Pass 4: output layer and 2x pixel shuffle. For each input texel, applies a
// 3x3 convolution to the eight feature channels in T0/T1 to obtain four
// values (r0.xyzw). Each value is added to the luma of one pixel of the
// corresponding 2x2 quad in OUTPUT; chroma comes from a linearly filtered
// sample of INPUT and is passed through unchanged.
//
// blockStart: origin of this thread group's block, in output pixels.
// tid:        thread id; only tid.x is used, as the index handed to Rmp8x8.
void Pass4(uint2 blockStart, uint3 tid) {
	// Size of one input texel in the coordinate space used for sampling
	// (presumably 1 / input size -- GetInputPt is defined elsewhere).
	float2 pt = float2(GetInputPt());
	// Top-left output pixel of this thread's 2x2 quad: the per-thread position
	// from Rmp8x8 (defined elsewhere) is doubled before the block origin is added.
	uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
	uint2 sz = GetOutputSize();
	// Only the quad's top-left pixel is tested against the output size.
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	// Centre of the input-resolution texel that this quad maps to (gxy / 2);
	// read implicitly by the O()/L0()/L1() macros.
	float2 pos = ((gxy >> 1) + 0.5) * pt;
	// 3x3 neighbourhoods named s<tex>_<row>_<col>: s0_* come from T0, s1_* from T1;
	// row 0 is y = -1 and column 0 is x = -1.
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	V4 r0 = 0.0;
	// The accumulator starts from constant per-channel offsets (the layer's bias terms).
	r0 = V4(1.178e-04, -5.913e-05, -9.275e-09, -1.228e-04);
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	s1_0_0 = L1(-1.0, -1.0); s1_0_1 = L1(0.0, -1.0); s1_0_2 = L1(1.0, -1.0);
	s1_1_0 = L1(-1.0, 0.0); s1_1_1 = L1(0.0, 0.0); s1_1_2 = L1(1.0, 0.0);
	s1_2_0 = L1(-1.0, 1.0); s1_2_1 = L1(0.0, 1.0); s1_2_2 = L1(1.0, 1.0);
	// Each tap is combined with one 4x4 weight matrix. MulAdd is not defined in
	// this file (see `//!USE MulAdd`); presumably it adds the product of its
	// first two arguments to the third -- TODO confirm.
	// The statement order fixes the floating-point summation order.
	r0 = MulAdd(s0_0_0, M4(-1.244e-02, -8.758e-03, -8.863e-03, 2.117e-02, -9.270e-04, 5.927e-04, 1.009e-03, -1.716e-03, -9.253e-04, -1.111e-03, 1.800e-03, 1.738e-03, -1.496e-02, -1.273e-02, -1.486e-02, 7.423e-03), r0);
	r0 = MulAdd(s0_0_1, M4(3.739e-03, -9.298e-06, 1.274e-03, 3.710e-03, 3.239e-03, 1.140e-03, -1.923e-02, -4.003e-03, 3.644e-02, -8.557e-03, 9.162e-04, -2.231e-03, -5.309e-01, 1.193e-02, -4.997e-02, -2.020e-02), r0);
	r0 = MulAdd(s0_0_2, M4(6.792e-03, -1.583e-03, -1.545e-04, 2.123e-03, -1.343e-01, 1.223e-01, 2.373e-03, -8.276e-02, -4.049e-03, 1.254e-01, -1.080e-02, 1.894e-02, 9.075e-03, 1.250e-02, -3.510e-02, -1.125e-01), r0);
	r0 = MulAdd(s0_1_0, M4(1.235e-01, -3.503e-02, -3.821e-02, -3.128e-02, -5.355e-03, 4.795e-04, -7.349e-03, -4.585e-04, 5.054e-03, 8.673e-04, 1.080e-03, -6.774e-04, -1.712e-03, 1.244e-02, -5.717e-03, 4.947e-03), r0);
	r0 = MulAdd(s0_1_1, M4(-2.232e-02, 3.194e-01, -4.260e-02, 9.687e-03, 5.554e-02, -3.243e-02, 4.311e-02, -3.627e-02, 1.753e-01, 1.504e-02, 1.343e-01, 8.075e-03, 2.201e-02, 3.087e-04, 3.701e-01, 2.575e-01), r0);
	r0 = MulAdd(s0_1_2, M4(5.742e-03, -5.625e-02, 2.008e-02, -8.259e-03, -3.426e-01, 4.014e-01, -5.684e-01, 5.084e-01, -4.118e-01, 2.838e-02, -1.102e-01, 2.417e-01, -1.824e-03, -1.022e-02, -9.827e-03, 1.753e-01), r0);
	r0 = MulAdd(s0_2_0, M4(-2.962e-02, 3.611e-02, -2.826e-02, 7.162e-02, 3.191e-03, -2.836e-03, 3.641e-03, 4.525e-04, -3.387e-03, 1.218e-03, -2.024e-03, 1.153e-03, 3.745e-03, -9.613e-04, 1.151e-02, 4.385e-04), r0);
	r0 = MulAdd(s0_2_1, M4(-4.404e-01, 4.677e-02, -2.510e-01, 4.287e-01, -1.260e-02, 9.716e-04, 1.849e-02, 2.118e-03, 2.679e-02, -4.549e-03, 8.995e-02, -8.610e-03, -4.725e-03, -2.950e-03, -1.471e-02, -1.283e-02), r0);
	r0 = MulAdd(s0_2_2, M4(1.786e-02, -3.493e-02, 2.228e-02, -1.174e-01, 5.050e-02, -1.421e-02, 2.838e-02, 4.065e-02, -1.205e-02, -3.918e-02, -1.469e-01, -9.351e-02, 1.648e-03, -1.034e-03, 4.440e-03, 6.707e-03), r0);
	r0 = MulAdd(s1_0_0, M4(-2.142e-02, 6.180e-03, -4.312e-04, -1.255e-03, 1.440e-01, -3.725e-03, -9.258e-03, -2.704e-02, 5.318e-02, 2.435e-02, 3.821e-02, 2.240e-02, 3.649e-02, -1.872e-02, -5.139e-03, -2.301e-03), r0);
	r0 = MulAdd(s1_0_1, M4(1.169e-02, -4.560e-02, 3.319e-03, -1.149e-02, 1.058e-02, 1.033e-01, -8.919e-03, -3.206e-03, 1.938e-02, 4.199e-02, 1.024e-02, 2.606e-02, -6.607e-04, 9.595e-02, -1.618e-02, -9.020e-03), r0);
	r0 = MulAdd(s1_0_2, M4(-2.192e-04, -1.368e-03, 2.637e-04, 2.084e-03, 3.796e-04, 4.478e-03, 4.530e-03, 3.802e-03, 1.389e-03, 8.964e-03, -7.443e-03, -9.200e-03, -4.164e-03, -4.553e-02, 8.826e-03, 5.783e-03), r0);
	r0 = MulAdd(s1_1_0, M4(-3.189e-02, 1.554e-02, -2.356e-02, 6.188e-03, -3.738e-01, -1.353e-01, 3.467e-01, -7.778e-02, 3.385e-01, 1.687e-01, -1.031e+00, -5.774e-02, 8.364e-03, 6.707e-03, 4.895e-02, 3.747e-02), r0);
	r0 = MulAdd(s1_1_1, M4(2.261e-01, -5.996e-01, 1.353e-01, 2.720e-02, -1.738e-02, 6.702e-02, 2.513e-02, 2.757e-01, -2.204e-03, 8.863e-02, 4.658e-02, -1.606e-01, 3.429e-01, 3.604e-01, -4.408e-02, -8.637e-01), r0);
	r0 = MulAdd(s1_1_2, M4(-5.780e-03, 3.479e-02, -1.777e-03, -6.527e-04, -3.946e-03, -2.386e-02, 3.971e-03, 7.918e-04, -8.640e-05, 3.242e-02, -1.883e-03, 1.259e-02, 1.303e-02, -1.050e-02, -2.165e-02, -2.813e-02), r0);
	r0 = MulAdd(s1_2_0, M4(-4.737e-03, 4.114e-03, -5.121e-03, -5.830e-04, 1.718e-03, -3.928e-04, -1.548e-01, -5.817e-02, -2.471e-02, 2.727e-03, 1.538e-01, 9.399e-02, -3.287e-03, -2.909e-03, -2.165e-02, 1.371e-03), r0);
	r0 = MulAdd(s1_2_1, M4(1.177e-03, 3.208e-02, 9.937e-02, 2.223e-02, 7.648e-03, 1.837e-02, -2.497e-02, -2.802e-02, -1.218e-02, -2.563e-03, -3.208e-03, 3.593e-02, 2.020e-02, -9.020e-03, 1.183e-01, 9.113e-02), r0);
	r0 = MulAdd(s1_2_2, M4(-1.707e-03, -3.909e-03, 6.098e-03, 2.484e-03, 5.783e-04, 9.003e-03, -6.119e-03, -1.253e-02, -6.749e-04, -8.294e-03, 1.991e-03, 1.878e-02, 9.696e-04, -1.977e-03, 4.883e-03, 1.341e-02), r0);
	// RY converts RGB to a luma/chroma triple (its first row holds the same luma
	// weights that Pass1 uses); YR converts the triple back to RGB.
	static const MF3x3 RY = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081}, YR = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
	// opt: size of one output pixel (presumably 1 / output size -- GetOutputPt is
	// defined elsewhere); fpos: centre of the quad's top-left output pixel.
	float2 opt = float2(GetOutputPt()), fpos = (float2(gxy) + 0.5) * opt;
	MF3 yuv;
	// For each pixel of the quad: sample INPUT with the linear sampler at that
	// output pixel's centre, convert to luma/chroma, add the matching r0
	// component to the luma and clamp it to [0, 1], then convert back to RGB
	// and write it with alpha 1.0.
	// r0.x -> (0, 0), r0.y -> (1, 0), r0.z -> (0, 1), r0.w -> (1, 1).
	yuv = mul(RY, INPUT.SampleLevel(SL, fpos + float2(0.0, 0.0) * opt, 0).rgb);
	OUTPUT[gxy + int2(0, 0)] = MF4(mul(YR, MF3(saturate(yuv.r + r0.x), yuv.yz)), 1.0);
	yuv = mul(RY, INPUT.SampleLevel(SL, fpos + float2(1.0, 0.0) * opt, 0).rgb);
	OUTPUT[gxy + int2(1, 0)] = MF4(mul(YR, MF3(saturate(yuv.r + r0.y), yuv.yz)), 1.0);
	yuv = mul(RY, INPUT.SampleLevel(SL, fpos + float2(0.0, 1.0) * opt, 0).rgb);
	OUTPUT[gxy + int2(0, 1)] = MF4(mul(YR, MF3(saturate(yuv.r + r0.z), yuv.yz)), 1.0);
	yuv = mul(RY, INPUT.SampleLevel(SL, fpos + float2(1.0, 1.0) * opt, 0).rgb);
	OUTPUT[gxy + int2(1, 1)] = MF4(mul(YR, MF3(saturate(yuv.r + r0.w), yuv.yz)), 1.0);
}
